123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392 |
- """Module for supporting the lxml.etree library. The idea here is to use as much
- of the native library as possible, without using fragile hacks like custom element
- names that break between releases. The downside of this is that we cannot represent
- all possible trees; specifically the following are known to cause problems:
- Text or comments as siblings of the root element
- Docypes with no name
- When any of these things occur, we emit a DataLossWarning
- """
- from __future__ import absolute_import, division, unicode_literals
- # pylint:disable=protected-access
- import warnings
- import re
- import sys
- try:
- from collections.abc import MutableMapping
- except ImportError:
- from collections import MutableMapping
- from . import base
- from ..constants import DataLossWarning
- from .. import constants
- from . import etree as etree_builders
- from .. import _ihatexml
- import lxml.etree as etree
- from pip._vendor.six import PY3, binary_type
- fullTree = True
- tag_regexp = re.compile("{([^}]*)}(.*)")
- comment_type = etree.Comment("asd").tag
- class DocumentType(object):
- def __init__(self, name, publicId, systemId):
- self.name = name
- self.publicId = publicId
- self.systemId = systemId
- class Document(object):
- def __init__(self):
- self._elementTree = None
- self._childNodes = []
- def appendChild(self, element):
- last = self._elementTree.getroot()
- for last in self._elementTree.getroot().itersiblings():
- pass
- last.addnext(element._element)
- def _getChildNodes(self):
- return self._childNodes
- childNodes = property(_getChildNodes)
- def testSerializer(element):
- rv = []
- infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
- def serializeElement(element, indent=0):
- if not hasattr(element, "tag"):
- if hasattr(element, "getroot"):
- # Full tree case
- rv.append("#document")
- if element.docinfo.internalDTD:
- if not (element.docinfo.public_id or
- element.docinfo.system_url):
- dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
- else:
- dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
- element.docinfo.root_name,
- element.docinfo.public_id,
- element.docinfo.system_url)
- rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
- next_element = element.getroot()
- while next_element.getprevious() is not None:
- next_element = next_element.getprevious()
- while next_element is not None:
- serializeElement(next_element, indent + 2)
- next_element = next_element.getnext()
- elif isinstance(element, str) or isinstance(element, bytes):
- # Text in a fragment
- assert isinstance(element, str) or sys.version_info[0] == 2
- rv.append("|%s\"%s\"" % (' ' * indent, element))
- else:
- # Fragment case
- rv.append("#document-fragment")
- for next_element in element:
- serializeElement(next_element, indent + 2)
- elif element.tag == comment_type:
- rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
- if hasattr(element, "tail") and element.tail:
- rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
- else:
- assert isinstance(element, etree._Element)
- nsmatch = etree_builders.tag_regexp.match(element.tag)
- if nsmatch is not None:
- ns = nsmatch.group(1)
- tag = nsmatch.group(2)
- prefix = constants.prefixes[ns]
- rv.append("|%s<%s %s>" % (' ' * indent, prefix,
- infosetFilter.fromXmlName(tag)))
- else:
- rv.append("|%s<%s>" % (' ' * indent,
- infosetFilter.fromXmlName(element.tag)))
- if hasattr(element, "attrib"):
- attributes = []
- for name, value in element.attrib.items():
- nsmatch = tag_regexp.match(name)
- if nsmatch is not None:
- ns, name = nsmatch.groups()
- name = infosetFilter.fromXmlName(name)
- prefix = constants.prefixes[ns]
- attr_string = "%s %s" % (prefix, name)
- else:
- attr_string = infosetFilter.fromXmlName(name)
- attributes.append((attr_string, value))
- for name, value in sorted(attributes):
- rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
- if element.text:
- rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
- indent += 2
- for child in element:
- serializeElement(child, indent)
- if hasattr(element, "tail") and element.tail:
- rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
- serializeElement(element, 0)
- return "\n".join(rv)
- def tostring(element):
- """Serialize an element and its child nodes to a string"""
- rv = []
- def serializeElement(element):
- if not hasattr(element, "tag"):
- if element.docinfo.internalDTD:
- if element.docinfo.doctype:
- dtd_str = element.docinfo.doctype
- else:
- dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
- rv.append(dtd_str)
- serializeElement(element.getroot())
- elif element.tag == comment_type:
- rv.append("<!--%s-->" % (element.text,))
- else:
- # This is assumed to be an ordinary element
- if not element.attrib:
- rv.append("<%s>" % (element.tag,))
- else:
- attr = " ".join(["%s=\"%s\"" % (name, value)
- for name, value in element.attrib.items()])
- rv.append("<%s %s>" % (element.tag, attr))
- if element.text:
- rv.append(element.text)
- for child in element:
- serializeElement(child)
- rv.append("</%s>" % (element.tag,))
- if hasattr(element, "tail") and element.tail:
- rv.append(element.tail)
- serializeElement(element)
- return "".join(rv)
- class TreeBuilder(base.TreeBuilder):
- documentClass = Document
- doctypeClass = DocumentType
- elementClass = None
- commentClass = None
- fragmentClass = Document
- implementation = etree
- def __init__(self, namespaceHTMLElements, fullTree=False):
- builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
- infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
- self.namespaceHTMLElements = namespaceHTMLElements
- class Attributes(MutableMapping):
- def __init__(self, element):
- self._element = element
- def _coerceKey(self, key):
- if isinstance(key, tuple):
- name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
- else:
- name = infosetFilter.coerceAttribute(key)
- return name
- def __getitem__(self, key):
- value = self._element._element.attrib[self._coerceKey(key)]
- if not PY3 and isinstance(value, binary_type):
- value = value.decode("ascii")
- return value
- def __setitem__(self, key, value):
- self._element._element.attrib[self._coerceKey(key)] = value
- def __delitem__(self, key):
- del self._element._element.attrib[self._coerceKey(key)]
- def __iter__(self):
- return iter(self._element._element.attrib)
- def __len__(self):
- return len(self._element._element.attrib)
- def clear(self):
- return self._element._element.attrib.clear()
- class Element(builder.Element):
- def __init__(self, name, namespace):
- name = infosetFilter.coerceElement(name)
- builder.Element.__init__(self, name, namespace=namespace)
- self._attributes = Attributes(self)
- def _setName(self, name):
- self._name = infosetFilter.coerceElement(name)
- self._element.tag = self._getETreeTag(
- self._name, self._namespace)
- def _getName(self):
- return infosetFilter.fromXmlName(self._name)
- name = property(_getName, _setName)
- def _getAttributes(self):
- return self._attributes
- def _setAttributes(self, value):
- attributes = self.attributes
- attributes.clear()
- attributes.update(value)
- attributes = property(_getAttributes, _setAttributes)
- def insertText(self, data, insertBefore=None):
- data = infosetFilter.coerceCharacters(data)
- builder.Element.insertText(self, data, insertBefore)
- def cloneNode(self):
- element = type(self)(self.name, self.namespace)
- if self._element.attrib:
- element._element.attrib.update(self._element.attrib)
- return element
- class Comment(builder.Comment):
- def __init__(self, data):
- data = infosetFilter.coerceComment(data)
- builder.Comment.__init__(self, data)
- def _setData(self, data):
- data = infosetFilter.coerceComment(data)
- self._element.text = data
- def _getData(self):
- return self._element.text
- data = property(_getData, _setData)
- self.elementClass = Element
- self.commentClass = Comment
- # self.fragmentClass = builder.DocumentFragment
- base.TreeBuilder.__init__(self, namespaceHTMLElements)
- def reset(self):
- base.TreeBuilder.reset(self)
- self.insertComment = self.insertCommentInitial
- self.initial_comments = []
- self.doctype = None
- def testSerializer(self, element):
- return testSerializer(element)
- def getDocument(self):
- if fullTree:
- return self.document._elementTree
- else:
- return self.document._elementTree.getroot()
- def getFragment(self):
- fragment = []
- element = self.openElements[0]._element
- if element.text:
- fragment.append(element.text)
- fragment.extend(list(element))
- if element.tail:
- fragment.append(element.tail)
- return fragment
- def insertDoctype(self, token):
- name = token["name"]
- publicId = token["publicId"]
- systemId = token["systemId"]
- if not name:
- warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
- self.doctype = None
- else:
- coercedName = self.infosetFilter.coerceElement(name)
- if coercedName != name:
- warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
- doctype = self.doctypeClass(coercedName, publicId, systemId)
- self.doctype = doctype
- def insertCommentInitial(self, data, parent=None):
- assert parent is None or parent is self.document
- assert self.document._elementTree is None
- self.initial_comments.append(data)
- def insertCommentMain(self, data, parent=None):
- if (parent == self.document and
- self.document._elementTree.getroot()[-1].tag == comment_type):
- warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
- super(TreeBuilder, self).insertComment(data, parent)
- def insertRoot(self, token):
- # Because of the way libxml2 works, it doesn't seem to be possible to
- # alter information like the doctype after the tree has been parsed.
- # Therefore we need to use the built-in parser to create our initial
- # tree, after which we can add elements like normal
- docStr = ""
- if self.doctype:
- assert self.doctype.name
- docStr += "<!DOCTYPE %s" % self.doctype.name
- if (self.doctype.publicId is not None or
- self.doctype.systemId is not None):
- docStr += (' PUBLIC "%s" ' %
- (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
- if self.doctype.systemId:
- sysid = self.doctype.systemId
- if sysid.find("'") >= 0 and sysid.find('"') >= 0:
- warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
- sysid = sysid.replace("'", 'U00027')
- if sysid.find("'") >= 0:
- docStr += '"%s"' % sysid
- else:
- docStr += "'%s'" % sysid
- else:
- docStr += "''"
- docStr += ">"
- if self.doctype.name != token["name"]:
- warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
- docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
- root = etree.fromstring(docStr)
- # Append the initial comments:
- for comment_token in self.initial_comments:
- comment = self.commentClass(comment_token["data"])
- root.addprevious(comment._element)
- # Create the root document and add the ElementTree to it
- self.document = self.documentClass()
- self.document._elementTree = root.getroottree()
- # Give the root element the right name
- name = token["name"]
- namespace = token.get("namespace", self.defaultNamespace)
- if namespace is None:
- etree_tag = name
- else:
- etree_tag = "{%s}%s" % (namespace, name)
- root.tag = etree_tag
- # Add the root element to the internal child/open data structures
- root_element = self.elementClass(name, namespace)
- root_element._element = root
- self.document._childNodes.append(root_element)
- self.openElements.append(root_element)
- # Reset to the default insert comment function
- self.insertComment = self.insertCommentMain
|